### Replication code for "Misdemeanor Disenfranchisement?"
### October 2018
### See the readme file for more details on what goes where in this replication package
### Contact Ariel White with questions: arwhi@mit.edu

#note that the commented code at the top of this file is extremely similar to that in "Harris_voterfilemerge.R"; I used the same process to merge the updated (2012-2016) case data to the 2012 and 2016 voter files as was used for the original court records-voter file merge.
#then, at the bottom of this script, the code pulls in the deidentified version of this merged file and replicates the tables/figures in SI Section 7.

###################################################################################################################
## start with defendants, match them to the TX voter file.  (same as in main "Harris_voterfilemerge.R")
#rm(list=ls())
#library(data.table)
#library(lubridate)
#setwd("/home/ariel/Dropbox (MIT)/Texas/Harris_fullsentencing")
#load("harrisfirsttime201216_withfutureanddisps.Rdata") #constructed the same way as the main dataset. 
#harrisoff <- collapsedfirsttime; rm(collapsedfirsttime)

#misdemeanors <- subset(harrisoff, harrisoff$crt<16); dim(misdemeanors)
#summary(misdemeanors$totalsentencedays); summary(misdemeanors$jail); summary(misdemeanors$nonconv); 
#misdemeanors[is.na(totalsentencedays)==T, sentencedays :=0]

##need to parse these names: output just the names
#subset <- misdemeanors
#defnames <- subset$def_nam

#setwd("/home/ariel/Dropbox (MIT)/Texas/HarrisCounty_sentencing")
#write.table(defnames, file= "Harrisco_defendantnames_indsubset.txt", row.names=F, col.names=F, sep=",")

##now run parsing script
#system('python "parsedefendantnames.py"')
##and merge the parsed names back in.
#dnames <- read.table("Harrisco_defendantnames_indsubset_parsed.txt", header=T, sep="|")
#setwd("/home/ariel/Dropbox (MIT)/Texas/Harris_fullsentencing")
#defendants <- data.table(subset, dnames)
#head(defendants)
#save(defendants, file="Harris1216_parseddefendants_withfuture.Rdata") 

##now try matching to the voter file   
#rm(list=ls())
#library(data.table)

#setwd("/nfs/home/A/awhite/shared_space/FLvote/Texas/merging")
#harris <- data.table(read.csv("export.csv", header=T, stringsAsFactors=F)) ##this is actually the whole state file, though named harris.
#load("Harris1216_parseddefendants_withfuture.Rdata") #"defendants".
#library(lubridate)

##match defendants to voter file
##first just try exact matching on last, first names, DOB
#setnames(defendants, "def_yob", "voterYOB")
#defendants[, voterfname := as.character(DefFirstName)]
#defendants[, voterlname := as.character(DefLastName)]
#defendants[, voterfinitial := substr(voterfname, 1,1)]
#names(defendants)

#defendants[, casedate:= ymd(firstcasedate)]
#defendants<- defendants[casedate > "2012-11-06",] 
#harris[,registered :=1]

##label names here.
#setnames(harris, "middle_name", "votermname")
#harris[, voterfname := toupper(first_name)]
#harris[, voterlname := toupper(last_name)]
#harris[, voterfinitial := substr(voterfname, 1, 1)]
#library(lubridate)
#harris[, voterDOB := ymd(born_at)]
#defendants[, voterDOB := ymd(def_dob)]
#defendants[,firstcasedate := NULL] #duplicated column causing problems.

#voter1 <- merge(harris, defendants, by=c("voterlname","voterfinitial", "voterDOB"), allow.cartesian=T, all.y=T)
#dim(voter1); dim(harris); dim(defendants)
#length(unique(voter1$def_spn))
#sum(voter1$registered, na.rm=T)/nrow(voter1)

##now need to trim down duplicates using firstname string match.
#require(stringdist)
#voter1$fnamematchdist <- mapply(stringdist, voter1$voterfname.x, voter1$voterfname.y, method="jw", p=0)
#summary(voter1$fnamematchdist) #runs 0-1, 0 is perfect match.
##note that most are perfect matches (due to birthdate)-- fuzzy first-name matching isn't doing that much work. 

#setkey(voter1, def_spn, fnamematchdist) #sort within birth records by match quality
##then drop any matches that are v. bad, and also keep only the best one of duplicated matches.
#voter1[, n:=1:.N, by=list(def_spn)]
#voter1.1<-voter1[n==1]
#dim(voter1.1); dim(voter1)
#length(unique(voter1.1$def_spn));(dim(voter1.1)) #no more dups.

#voter1.1[fnamematchdist >.2 & is.na(fnamematchdist)==F, registered := 0] #drop out the behavioral stuff for these bad matches, but keep the defendant observations.
#voter1.1[fnamematchdist >.2 & is.na(fnamematchdist)==F, general_2010 := NA]
#voter1.1[fnamematchdist >.2 & is.na(fnamematchdist)==F, general_2008 := NA]
#voter1.1[fnamematchdist >.2 & is.na(fnamematchdist)==F, vh12g1 := NA]
#voter1.1[fnamematchdist >.2 & is.na(fnamematchdist)==F, vh08g1 := NA]

#voter1 <- voter1.1
#rm(voter1.1)
#head(voter1)
#sum(voter1$registered, na.rm=T)

##set up outcome vars for voting
#voter1[, vote2012 := 0]
#voter1[vh12g1>0 & is.na(vh12g1)==F, vote2012 := 1]
#sum(voter1$vote2012, na.rm=T) 

#voter1[is.na(registered)==T, registered:=0]
#voter1[is.na(vote2012)==T, vote2012:=0]
#sum(voter1$registered, na.rm=T)/nrow(voter1)
#sum(voter1$vote2012, na.rm=T)/nrow(voter1)

#sum(voter1$vote2012, na.rm=T)/sum(voter1$registered, na.rm=T)

######################################################################################
##okay, now setup:

##clean up some covars:
#voter1[, male := NA]; voter1[def_sex=="M", male := 1]; voter1[def_sex=="F", male := 0]
#voter1[, black := NA]; voter1[def_rac == "B", black:= 1]; 
#voter1[(is.na(def_rac)==F) & (def_rac != "B"), black := 0]
#voter1[ageatfile < 10950,over30 := 0]
#voter1[ageatfile >= 10950,over30 := 1]

##rename things just so old code works
#setnames(voter1, "anyjail", "jail")
#setnames(voter1, "anyfine", "fine")
#setnames(voter1, "anyprobation", "probation")
#setnames(voter1, "totalsentencedays", "sentencedays")

##calc instruments by year.
#voter1[,crtconvrate1:= mean(anyconv, na.rm=T), by=list(crt, fyear)]
#voter1[,crtsentavg1:= mean(sentencedays, na.rm=T), by=list(crt, fyear)]
#voter1[,crtjailavg1:= mean(jail, na.rm=T), by=list(crt, fyear)]
#voter1[,sent1mavg1:= mean(sent1mplus, na.rm=T), by=list(crt, fyear)]
#voter1[,crtfineavg1:= mean(fine, na.rm=T), by=list(crt, fyear)]
#voter1[,crtprobationavg1:= mean(probation, na.rm=T), by=list(crt, fyear)]

##also create the simplest court dummies.
#inds <- unique(voter1$crt) #15 courtrooms
#voter1[,paste("crt_", (inds), sep=""):=lapply(inds,function(x)crt==x)]

#library(AER)
#library(stargazer)

#summary(voter1$sentencedays)
#dim(voter1)
#voter1 <- voter1[sentencedays <65000,];dim(voter1)
### cut all the summary/descriptive stuff that was here in the other version of the file.

##now, merge the 2017 Harris file in as well, so we can look at 2016 turnout. 
#load("harrisvfile2016.Rdata") #Harris County file collected from Nationbuilder April 2017
#harrisvoters16 <- as.data.table(harrisvoters16)
#defendants <- voter1 #rename so the same code works. 
##match defendants to voter file
#defendants[, VoterFName := as.character(DefFirstName)]
#defendants[, VoterLName := as.character(DefLastName)]
#defendants[, VoterFInitial := substr(VoterFName, 1,1)]

##defendants[, casedate:= ymd(firstcasedate)]
#harrisvoters16[,registeredin17file :=1]

##label names here.
#setnames(harrisvoters16, "middle_name", "VoterMName")
#harrisvoters16[, VoterFName := toupper(first_name)]
#harrisvoters16[, VoterLName := toupper(last_name)]
#harrisvoters16[, VoterFInitial := substr(VoterFName, 1, 1)]
#library(lubridate)
#harrisvoters16[, voter.DOB := ymd(birthdate)]
#defendants[, voter.DOB := ymd(def_dob)]
#defendants[,sentencedays := NULL] #duplicated column causing problems.

#voter1 <- merge(harrisvoters16, defendants, by=c("VoterLName","VoterFInitial", "voter.DOB"), allow.cartesian=T, all.y=T)
#dim(voter1); dim(harrisvoters16); dim(defendants)
#length(unique(voter1$def_spn))
#sum(voter1$registeredin17file, na.rm=T)/nrow(voter1)

##now need to trim down duplicates using firstname string match.
#require(stringdist)
#voter1$fnamematchdist <- mapply(stringdist, voter1$VoterFName.x, voter1$VoterFName.y, method="jw", p=0)
#summary(voter1$fnamematchdist) #runs 0-1, 0 is perfect match.
##note that most are perfect matches (due to birthdate)-- fuzzy first-name matching isn't doing that much work. 

#setkey(voter1, def_spn, fnamematchdist) #sort within birth records by match quality
##then drop any matches that are v. bad, and also keep only the best one of duplicated matches.
#voter1[, n:=1:.N, by=list(def_spn)]
#voter1.1<-voter1[n==1]
#dim(voter1.1); dim(voter1)
#length(unique(voter1.1$def_spn));(dim(voter1.1)) #no more dups.

#voter1.1[fnamematchdist >.2 & is.na(fnamematchdist)==F, registeredinearlyfile := 0] #drop out the behavioral stuff for these bad matches, but keep the defendant observations.
#voter1.1[fnamematchdist >.2 & is.na(fnamematchdist)==F, general_2010 := NA]
#voter1.1[fnamematchdist >.2 & is.na(fnamematchdist)==F, general_2008 := NA]

#voter1 <- voter1.1
#rm(voter1.1)
#head(voter1)
#sum(voter1$registeredin17file, na.rm=T)/nrow(voter1)

##and now set up voter turnout from new file
#table(harrisvoters16$vote_method.General.2016)
#table(voter1$vote_method.General.2016)
#voter1[, voted16:=0]
#voter1[vote_method.General.2016 == "voted", voted16:=1]
#summary(voter1$voted16)

##now generate the deidentified file that will be used for replication.
#colnames(voter1)
#voter1full <- copy(voter1)

#keep <- c("fyear", "registered", "crt", "voterYOB", "def_sex", "def_rac", "ageatfile", "sentencedays", "jail", "fine", "probation", "anyconv", "mostsevcharge", "numcases", "casedate", "vote2012", "male", "black", "over30", "crtconvrate1","crtsentavg1","crtjailavg1","crtfineavg1", "crtprobationavg1","vote2012", "voted16")

#voter1 <- subset(voter1, select=keep)

#save(voter1, file="defendants_voter1recentcases_deidentified.Rdata")

#############################################################################
## Now main IV analysis but for placebo looking backwards
# Packages: data.table for the by-reference column updates below, AER for
# ivreg(), and stargazer for the LaTeX tables written further down.
library(data.table)
library(AER)
library(stargazer)

# Brings the de-identified defendant/voter-file merge into the workspace;
# everything below works on the `voter1` data.table.
load("defendants_voter1recentcases_deidentified.Rdata")

# Courtroom indicators for later: one logical column per distinct value of
# `crt`, named crt_<value>.
inds <- unique(voter1$crt) #15 courtrooms
crt_dummy_names <- paste0("crt_", inds)
voter1[, (crt_dummy_names) := lapply(inds, function(court) crt == court)]

# classA is 1 where mostsevcharge equals 5 and 0 everywhere else
# (rows with a missing mostsevcharge stay at 0).
voter1[, classA := as.numeric(!is.na(mostsevcharge) & mostsevcharge == 5)]

# Pooled first stage: individual jail outcome on the courtroom-by-year jail
# rate, with fyear as a control.
fs2all <- lm(jail ~ crtjailavg1 + fyear, data = voter1)
summary(fs2all)

# Pooled placebo IV: 2012 turnout on jail, instrumented by the
# courtroom-by-year jail rate.
iv2all <- ivreg(vote2012 ~ jail + fyear | crtjailavg1 + fyear, data = voter1)
summary(iv2all)

## Possible racial non-monotonicity: split the sample by recorded race and
## rebuild the jail-rate instrument inside each subsample (this does not make
## a huge difference to the estimates).
white <- voter1[def_rac == "W"]
black <- voter1[def_rac == "B"]
# These two subsets cover most, but not all, defendants in the data.

# Within-race instrument: mean jail rate by courtroom and filing year.
black[, crtjailavg1u := mean(jail, na.rm = TRUE), by = .(crt, fyear)]
white[, crtjailavg1u := mean(jail, na.rm = TRUE), by = .(crt, fyear)]

# Black defendants: first stage, then placebo IV on 2012 turnout.
fs2b <- lm(jail ~ crtjailavg1u + fyear, data = black)
summary(fs2b)
iv2b <- ivreg(vote2012 ~ jail + fyear | crtjailavg1u + fyear, data = black)
summary(iv2b)

# White defendants: same two models (fitted for reference; not in the table).
fs2w <- lm(jail ~ crtjailavg1u + fyear, data = white)
summary(fs2w)
iv2w <- ivreg(vote2012 ~ jail + fyear | crtjailavg1u + fyear, data = white)
summary(iv2w)

# Updated placebo (more cases) written out as a LaTeX table.
# Table A37, section 7 of the SI.
placebo_f_black <- round(summary(fs2b)$fstatistic[1], digits = 2)
placebo_f_all <- round(summary(fs2all)$fstatistic[1], digits = 2)
stargazer(
  iv2b, iv2all,
  omit = c("fyear"),
  omit.labels = c("Year dummies"),
  add.lines = list(c("First Stage F-Statistic", placebo_f_black, placebo_f_all)),
  label = "raceIVpost2012placebo",
  out = "raceIV_post2012placebo.tex",
  title = "Placebo IV estimates: Jail sentences on 2012 voting, by race",
  omit.stat = c("rsq", "ser"),
  dep.var.labels = "Voted 2012",
  column.labels = c("Black Defendants", "All Defendants")
)

#####################################################################################
# Forward-looking analysis: the same specifications as the 2012 placebo, but with
# 2016 turnout (voted16) as the outcome.

# Pooled sample. The first stage is identical to the placebo section's; it is
# refit here so this section prints its own summary and can be run on its own
# once voter1 is prepared.
fs2all <- lm(jail ~ crtjailavg1 + fyear, data = voter1)
summary(fs2all)
iv2all <- ivreg(voted16 ~ jail + fyear | crtjailavg1 + fyear, data = voter1)
summary(iv2all)

# Black defendants, using the within-race instrument crtjailavg1u.
fs2b <- lm(jail ~ crtjailavg1u + fyear, data = black)
summary(fs2b)
iv2b <- ivreg(voted16 ~ jail + fyear | crtjailavg1u + fyear, data = black)
summary(iv2b)

# White defendants, using the within-race instrument crtjailavg1u.
fs2w <- lm(jail ~ crtjailavg1u + fyear, data = white)
summary(fs2w)
iv2w <- ivreg(voted16 ~ jail + fyear | crtjailavg1u + fyear, data = white)
summary(iv2w) #interesting. is the sample shifting to be more Latino?

# Exploratory follow-up to that question: list the 20 most common surnames among
# white-coded defendants and among all defendants. This needs the VoterLName
# column, which is not in the `keep` list used to build the de-identified file
# (see the commented-out merge code earlier in this script). Without it,
# white$VoterLName is NULL and the tabulation is silently empty, so run the
# check only when the column is actually present.
if ("VoterLName" %in% colnames(voter1)) {
  surname_counts_white <- sort(table(white$VoterLName))
  print(tail(surname_counts_white, 20)) #a different crowd; not so comparable across time?
  surname_counts_all <- sort(table(voter1$VoterLName))
  print(tail(surname_counts_all, 20))
} else {
  message("Skipping surname tabulation: column VoterLName is not in voter1.")
}

# Table A36, section 7 of the SI.
# The reported F is the overall F-statistic of each first-stage regression.
stargazer(
  iv2b, iv2all,
  omit = c("fyear"),
  omit.labels = c("Year dummies"),
  add.lines = list(c(
    "First Stage F-Statistic",
    round(summary(fs2b)$fstatistic[1], digits = 2),
    round(summary(fs2all)$fstatistic[1], digits = 2)
  )),
  label = "raceIVpost2012",
  out = "raceIV_post2012.tex",
  title = "IV estimates: Jail sentences on 2016 voting, by race",
  omit.stat = c("rsq", "ser"),
  dep.var.labels = "Voted 2016",
  column.labels = c("Black Defendants", "All Defendants")
)

# Turnout rates by subsample, for reference.
summary(white$voted16)
summary(black$voted16)
summary(white$vote2012)
summary(black$vote2012)

########################################################################################################################
### Balance check for R5: are 2012 voters less likely to end up in harsh courtrooms?
# Courtroom-level scatterplot in the style of the main balance figures.

# One row per courtroom: case count, share jailed, share who voted in 2012.
# NOTE(review): sum(jail) has no na.rm, so a missing jail value would make that
# courtroom's pctjail NA -- assumes jail is fully observed; confirm.
voter1[, cases := 1]
courts <- voter1[, list(totalcases = sum(cases),
                        pctjail = sum(jail) / sum(cases),
                        pctvoted12 = sum(vote2012) / sum(cases)),
                 by = list(crt)]
setkey(courts, crt) #here's everyone

allcourts <- courts
courts1 <- as.data.frame(allcourts)

# Columns to plot against the courtroom jail rate, and their panel titles.
# (The titles are kept out of a variable called `names` so base::names is not
# shadowed.)
chars <- c("pctvoted12")
panel_titles <- c("Voted 2012")

pdf("balancescatters_courts_postelectiondefs.pdf", height = 8, width = 5.5)
# Close the PDF device even if a plotting call fails, so a failed run does not
# leave a dangling graphics device or a half-written file handle behind.
invisible(tryCatch({
  for (i in seq_along(chars)) {
    vec <- courts1[, chars[i]]
    label <- paste("Percent", panel_titles[i])
    scatter.smooth(courts1$pctjail ~ vec, main = panel_titles[i], xlab = label,
                   ylab = "Courtroom Jail Rate", span = 1.5, ylim = c(.45, .65))
  }
}, finally = dev.off()))

# Same balance check, but one percent-voted-2012 panel per filing year.

# One row per courtroom-year: case count, share jailed, share who voted in 2012.
# Relies on the `cases` column (all 1s) already being present on voter1.
courts <- voter1[, list(totalcases = sum(cases),
                        pctjail = sum(jail) / sum(cases),
                        pctvoted12 = sum(vote2012) / sum(cases)),
                 by = list(crt, fyear)]
setkey(courts, crt) #here's everyone

allcourts <- courts [!(fyear=="2012"), ] #drop that one month
courts1 <- as.data.frame(allcourts)

chars <- c("2013", "2014", "2015", "2016")
label <- "Percent Voted 2012"

pdf("balancescatters_courts_postelectiondefs_byyear.pdf", height = 8, width = 5.5)
# Close the PDF device even if a plotting call fails part-way through the panels.
invisible(tryCatch({
  # 2 x 2 grid of panels with tightened margins; applies to the PDF device only.
  par(mfrow = c(2, 2), mai = c(.5, 0.4, 0.3, 0.1), mgp = c(2, 1, 0))
  for (yr in chars) {
    # which() keeps only rows that positively match the year, so a missing
    # fyear cannot inject all-NA rows into the panel data.
    panel <- courts1[which(courts1$fyear == yr), ]
    vec <- panel[, "pctvoted12"]
    scatter.smooth(panel$pctjail ~ vec, main = yr, xlab = label,
                   ylab = "Courtroom Jail Rate", span = 1.5, ylim = c(.4, .7))
  }
}, finally = dev.off()))

